package com.dhm.wordcount;

import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.IndexTokenizer;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.springframework.core.io.ClassPathResource;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * @author duhongming
 * @version 1.0
 * @description Tokenizes titles, removes stop words, and removes duplicate lines
 * @date 2019/11/8 13:41
 */
public class HeadLineTokenizer {

    // Register custom stop words on top of HanLP's built-in stop-word dictionary.
    static {
        CoreStopWordDictionary.add("图");
        CoreStopWordDictionary.add("图表");
        CoreStopWordDictionary.add("%");
        CoreStopWordDictionary.add("-");
    }

    /**
     * Reads {@code titles.txt} from the classpath, removes duplicate lines,
     * tokenizes each unique title (stop words stripped) and writes the
     * tab-separated tokens to {@code words.txt} in the same directory.
     *
     * @param args unused
     * @throws IOException if the input resource cannot be read or the output file written
     */
    public static void main(String[] args) throws IOException {
        ClassPathResource resource = new ClassPathResource("static/data/wordcount/titles.txt");

        // try-with-resources: the original code leaked the InputStream.
        // Charset is given explicitly — the no-charset overload is deprecated
        // and falls back to the platform default encoding.
        List<String> titles;
        try (InputStream in = resource.getInputStream()) {
            titles = IOUtils.readLines(in, StandardCharsets.UTF_8);
        }

        // LinkedHashSet removes duplicates while keeping first-seen order,
        // so the generated words.txt is deterministic across runs.
        Set<String> uniqueTitles = new LinkedHashSet<>(titles);

        // Place words.txt next to titles.txt without string-mangling the URI path.
        File outWordsFile = new File(resource.getFile().getParentFile(), "words.txt");

        List<String> words = new ArrayList<>(uniqueTitles.size());
        int lineNo = 0;
        // BUG FIX: the original looped i < uniqueTitles.size() but indexed
        // titles.get(i), so duplicates were still tokenized and any titles past
        // the (smaller) deduplicated count were silently dropped. Iterate the
        // deduplicated set itself instead.
        for (String title : uniqueTitles) {
            words.add(getIndexTokenizer(title));
            System.out.println(lineNo++ + " line = " + title);
        }

        FileUtils.writeLines(outWordsFile, words);
    }

    /**
     * Tokenizes a single line with HanLP's index tokenizer and removes stop words.
     *
     * @param line the raw title text
     * @return the remaining tokens joined by tab characters
     */
    private static String getIndexTokenizer(String line) {
        List<Term> terms = IndexTokenizer.segment(line);

        // Filter the segmentation result in place against the stop-word dictionary.
        CoreStopWordDictionary.apply(terms);

        StringJoiner sj = new StringJoiner("\t");
        for (Term term : terms) {
            sj.add(term.word);
        }
        return sj.toString();
    }
}